Prerequisites

Import required libraries

library("tidyverse")
Registered S3 method overwritten by 'dplyr':
  method           from
  print.rowwise_df     
Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
✓ ggplot2 3.3.0     ✓ purrr   0.3.3
✓ tibble  3.0.0     ✓ dplyr   0.8.5
✓ tidyr   1.0.2     ✓ stringr 1.4.0
✓ readr   1.3.1     ✓ forcats 0.5.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
library("dplyr")

Data Wrangling

Read dataset_1

# read data and drop ID col
dataset_1 <- read.csv("../raw_data/dataset_1/winemag-data-130k-v2.csv")[-1]

Read dataset_2

Combine all the csv files for dataset_2

# function reads csv values
readfile <- function(filename){
  cat("Reading file: ", filename, "...\n", sep = '' )
  return(read.csv(file = filename, header = TRUE))
}

# function merges two dataframes together
merge_dataframes <- function(dataframe_1, dataframe_2){
  return(rbind(dataframe_1, dataframe_2))
}

# funciton merges all csv files in folder to one dataframe
combine_data = function(mypath){
  filenames=list.files(path=mypath, full.names=TRUE)
  datalist = lapply(filenames, function(x) readfile(x))
  return(Reduce(f = function(x,y) merge_dataframes(x,y), x = datalist, accumulate = FALSE))
}

# merge csv data given dataset folder
dataset_2 <- combine_data("../raw_data/dataset_2")
Reading file: ../raw_data/dataset_2/winemag-1-800.csv...
Reading file: ../raw_data/dataset_2/winemag-10401-11200.csv...
Reading file: ../raw_data/dataset_2/winemag-11201-12000.csv...
Reading file: ../raw_data/dataset_2/winemag-12001-12800.csv...
Reading file: ../raw_data/dataset_2/winemag-1601-2400.csv...
Reading file: ../raw_data/dataset_2/winemag-2401-3200.csv...
Reading file: ../raw_data/dataset_2/winemag-3201-4000.csv...
Reading file: ../raw_data/dataset_2/winemag-4001-4800.csv...
Reading file: ../raw_data/dataset_2/winemag-4801-5600.csv...
Reading file: ../raw_data/dataset_2/winemag-5601-6400.csv...
Reading file: ../raw_data/dataset_2/winemag-6401-7200.csv...
Reading file: ../raw_data/dataset_2/winemag-7201-8000.csv...
Reading file: ../raw_data/dataset_2/winemag-8001-8800.csv...
Reading file: ../raw_data/dataset_2/winemag-801-1600.csv...
Reading file: ../raw_data/dataset_2/winemag-8801-9600.csv...
Reading file: ../raw_data/dataset_2/winemag-9601-10400.csv...

Join Datasets

Join datasets by title, description, price, country, points = rating. We choose to join by these characteristics, because it would allow us to safely assume that the wine reviews being merged are the same. However, we intentionally left choose not to join based on certain chacteristics like province = region, because there were some input errors in the data. For example, in dataset_1 the province was “Sicily & Sardinia” where as in dataset_2 the region was “Sicily & Sardinia”.

join_wines <- inner_join(dataset_1, dataset_2, by = c("title", "description", "price", "country", "points" = "rating"))
Column `title` joining factors with different levels, coercing to character vectorColumn `description` joining factors with different levels, coercing to character vectorColumn `country` joining factors with different levels, coercing to character vector

Visualize joined data

head(join_wines)

Clean joined data to remove redundancies

join_wines_cleaned <- join_wines %>%
  select(country, description, designation=designation.x, points, price, province, region=region_1, subregion=region_2, taster_name, taster_twitter_handle, title, variety, winery=winery.x, alcohol, category, url, vintage)

head(join_wines_cleaned)

Extract relevant information for our use

wines <- join_wines_cleaned %>%
  select(title, alcohol, category, vintage, designation, country, province, region, subregion,  variety, winery, price, points, taster_name, taster_twitter_handle)

head(wines)

Write to CSV

Joined uncleaned Data

Output uncleaned joined data for others to use

write_csv(join_wines, "joined_datasets.csv")

Joined Cleaned Data

Outout cleaned joined data for others to use

write_csv(join_wines_cleaned, "joined_datasets_cleaned.csv")

Joined Cleaned Data with relevant categories

Output for easy import later

write_csv(wines, "wines.csv")
LS0tCnRpdGxlOiAiUHJlcHJjZXNzaW5nIERhdGEiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KIyBQcmVyZXF1aXNpdGVzCgpJbXBvcnQgcmVxdWlyZWQgbGlicmFyaWVzCmBgYHtyfQpsaWJyYXJ5KCJ0aWR5dmVyc2UiKQpsaWJyYXJ5KCJkcGx5ciIpCmBgYAoKIyBEYXRhIFdyYW5nbGluZwoKIyMgUmVhZCBkYXRhc2V0XzEKCmBgYHtyfQojIHJlYWQgZGF0YSBhbmQgZHJvcCBJRCBjb2wKZGF0YXNldF8xIDwtIHJlYWQuY3N2KCIuLi9yYXdfZGF0YS9kYXRhc2V0XzEvd2luZW1hZy1kYXRhLTEzMGstdjIuY3N2IilbLTFdCmBgYAoKIyMgUmVhZCBkYXRhc2V0XzIKCkNvbWJpbmUgYWxsIHRoZSBjc3YgZmlsZXMgZm9yIGRhdGFzZXRfMgpgYGB7cn0KIyBmdW5jdGlvbiByZWFkcyBjc3YgdmFsdWVzCnJlYWRmaWxlIDwtIGZ1bmN0aW9uKGZpbGVuYW1lKXsKICBjYXQoIlJlYWRpbmcgZmlsZTogIiwgZmlsZW5hbWUsICIuLi5cbiIsIHNlcCA9ICcnICkKICByZXR1cm4ocmVhZC5jc3YoZmlsZSA9IGZpbGVuYW1lLCBoZWFkZXIgPSBUUlVFKSkKfQoKIyBmdW5jdGlvbiBtZXJnZXMgdHdvIGRhdGFmcmFtZXMgdG9nZXRoZXIKbWVyZ2VfZGF0YWZyYW1lcyA8LSBmdW5jdGlvbihkYXRhZnJhbWVfMSwgZGF0YWZyYW1lXzIpewogIHJldHVybihyYmluZChkYXRhZnJhbWVfMSwgZGF0YWZyYW1lXzIpKQp9CgojIGZ1bmNpdG9uIG1lcmdlcyBhbGwgY3N2IGZpbGVzIGluIGZvbGRlciB0byBvbmUgZGF0YWZyYW1lCmNvbWJpbmVfZGF0YSA9IGZ1bmN0aW9uKG15cGF0aCl7CiAgZmlsZW5hbWVzPWxpc3QuZmlsZXMocGF0aD1teXBhdGgsIGZ1bGwubmFtZXM9VFJVRSkKICBkYXRhbGlzdCA9IGxhcHBseShmaWxlbmFtZXMsIGZ1bmN0aW9uKHgpIHJlYWRmaWxlKHgpKQogIHJldHVybihSZWR1Y2UoZiA9IGZ1bmN0aW9uKHgseSkgbWVyZ2VfZGF0YWZyYW1lcyh4LHkpLCB4ID0gZGF0YWxpc3QsIGFjY3VtdWxhdGUgPSBGQUxTRSkpCn0KCiMgbWVyZ2UgY3N2IGRhdGEgZ2l2ZW4gZGF0YXNldCBmb2xkZXIKZGF0YXNldF8yIDwtIGNvbWJpbmVfZGF0YSgiLi4vcmF3X2RhdGEvZGF0YXNldF8yIikKYGBgCgojIyBKb2luIERhdGFzZXRzCgpKb2luIGRhdGFzZXRzIGJ5IGB0aXRsZWAsICBgZGVzY3JpcHRpb25gLCBgcHJpY2VgLCBgY291bnRyeWAsIGBwb2ludHNgID0gYHJhdGluZ2AuIFdlIGNob29zZSB0byBqb2luIGJ5IHRoZXNlIGNoYXJhY3RlcmlzdGljcywgYmVjYXVzZSBpdCB3b3VsZCBhbGxvdyB1cyB0byBzYWZlbHkgYXNzdW1lIHRoYXQgdGhlIHdpbmUgcmV2aWV3cyBiZWluZyBtZXJnZWQgYXJlIHRoZSBzYW1lLiBIb3dldmVyLCB3ZSBpbnRlbnRpb25hbGx5IGxlZnQgY2hvb3NlIG5vdCB0byBqb2luIGJhc2VkIG9uIGNlcnRhaW4gY2hhY3RlcmlzdGljcyBsaWtlIGBwcm92aW5jZWAgPSBgcmVnaW9uYCwgYmVjYXVzZSB0aGVyZSB3ZXJlIHNvbWUgaW5wdXQgZXJyb3JzIGluIHRoZSBkYXRhLiBGb3IgZXhhbXBsZSwgaW4gYGRhdGFzZXRfMWAgdGhlIGBwcm92aW5jZWAgd2FzICJTaWNpbHkgJiBTYXJkaW5pYSIgd2hlcmUgYXMgaW4gYGRhdGFzZXRfMmAgdGhlICBgcmVnaW9uYCB3YXMgIlNpY2lseSAmYW1wOyBTYXJkaW5pYSIuCmBgYHtyfQpqb2luX3dpbmVzIDwtIGlubmVyX2pvaW4oZGF0YXNldF8xLCBkYXRhc2V0XzIsIGJ5ID0gYygidGl0bGUiLCAiZGVzY3JpcHRpb24iLCAicHJpY2UiLCAiY291bnRyeSIsICJwb2ludHMiID0gInJhdGluZyIpKQpgYGAKClZpc3VhbGl6ZSBqb2luZWQgZGF0YQpgYGB7cn0KaGVhZChqb2luX3dpbmVzKQpgYGAKCkNsZWFuIGpvaW5lZCBkYXRhIHRvIHJlbW92ZSByZWR1bmRhbmNpZXMKYGBge3J9CmpvaW5fd2luZXNfY2xlYW5lZCA8LSBqb2luX3dpbmVzICU+JQogIHNlbGVjdChjb3VudHJ5LCBkZXNjcmlwdGlvbiwgZGVzaWduYXRpb249ZGVzaWduYXRpb24ueCwgcG9pbnRzLCBwcmljZSwgcHJvdmluY2UsIHJlZ2lvbj1yZWdpb25fMSwgc3VicmVnaW9uPXJlZ2lvbl8yLCB0YXN0ZXJfbmFtZSwgdGFzdGVyX3R3aXR0ZXJfaGFuZGxlLCB0aXRsZSwgdmFyaWV0eSwgd2luZXJ5PXdpbmVyeS54LCBhbGNvaG9sLCBjYXRlZ29yeSwgdXJsLCB2aW50YWdlKQoKaGVhZChqb2luX3dpbmVzX2NsZWFuZWQpCmBgYAoKCkV4dHJhY3QgcmVsZXZhbnQgaW5mb3JtYXRpb24gZm9yIG91ciB1c2UKYGBge3J9CndpbmVzIDwtIGpvaW5fd2luZXNfY2xlYW5lZCAlPiUKICBzZWxlY3QodGl0bGUsIGFsY29ob2wsIGNhdGVnb3J5LCB2aW50YWdlLCBkZXNpZ25hdGlvbiwgY291bnRyeSwgcHJvdmluY2UsIHJlZ2lvbiwgc3VicmVnaW9uLCAgdmFyaWV0eSwgd2luZXJ5LCBwcmljZSwgcG9pbnRzLCB0YXN0ZXJfbmFtZSwgdGFzdGVyX3R3aXR0ZXJfaGFuZGxlKQoKaGVhZCh3aW5lcykKYGBgCgoKIyBXcml0ZSB0byBDU1YKCiMjIyBKb2luZWQgdW5jbGVhbmVkIERhdGEKCk91dHB1dCB1bmNsZWFuZWQgam9pbmVkIGRhdGEgZm9yIG90aGVycyB0byB1c2UKYGBge3J9CndyaXRlX2Nzdihqb2luX3dpbmVzLCAiam9pbmVkX2RhdGFzZXRzLmNzdiIpCmBgYAoKIyMjIEpvaW5lZCBDbGVhbmVkIERhdGEKCk91dG91dCBjbGVhbmVkIGpvaW5lZCBkYXRhIGZvciBvdGhlcnMgdG8gdXNlCmBgYHtyfQp3cml0ZV9jc3Yoam9pbl93aW5lc19jbGVhbmVkLCAiam9pbmVkX2RhdGFzZXRzX2NsZWFuZWQuY3N2IikKYGBgCgojIyMgSm9pbmVkIENsZWFuZWQgRGF0YSB3aXRoIHJlbGV2YW50IGNhdGVnb3JpZXMKCk91dHB1dCBmb3IgZWFzeSBpbXBvcnQgbGF0ZXIKYGBge3J9CndyaXRlX2Nzdih3aW5lcywgIndpbmVzLmNzdiIpCmBgYAo=